Summary and pre-processing

Experiment was done in three batches: Batch1 samples: NM05N, NM04N1, NH64T Batch2 samples: NH87ND, NH87T, HCM-CSHL-0655-C50 Batch3 samples: multiplexed 4 samples with the following barcodes GACAGTGC HCM-CSHL-0366-C50 GAGTTAGC NH85TSc GATGAATC NH95T GCCAAGAC NH93T

10x Single cell RNA-seq analysis -- pre-processing using cell ranger -- aligned to grch38

load packages

library(Seurat)
library(dplyr)
library(Matrix)
library(magrittr)
library(future.apply)
library(cowplot)
library(hdf5r)
library(stringr)
library(ggplot2)
getwd()

process data files

NH64T= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_03-NH64T_filtered_feature_bc_matrix.h5",use.names=TRUE, unique.features = TRUE)
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## read SB04 remapped to grch38

NH87T= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_04_10xGEX_NH87TT_filtered_feature_bc_matrix.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
`HCM-CSHL-0655-C50`= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_04_10xGEX_HC_0655_filtered_feature_bc_matrix.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## read SB06 mapped to grch38

sb06_data= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_06_superloaded10xGEX_filtered_feature_bc_matrix.h5",use.names=TRUE, unique.features = TRUE)
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## make Seurat objects
sb06_obj= CreateSeuratObject(sb06_data, min.cells = 3, min.features = 200, project = "sb06_4x")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# barcodes from Vireo

library(readr)
barcode_key <- read_csv("~/Documents/HumanOrganoid/Bioinformatics/scRNA-seq/Data_from_JPreall/SB06/demultiplex_vireo/barcode_key.csv", 
                        col_names = FALSE)

donor_ids <- read_delim("~/Documents/HumanOrganoid/Bioinformatics/scRNA-seq/Data_from_JPreall/SB06/demultiplex_vireo/vireo/donor_ids.tsv", 
                        "\t", escape_double = FALSE, trim_ws = TRUE)

#GACAGTGC HCM-CSHL-0366-C50

#GAGTTAGC NH85TSc

#GATGAATC NH95T

#GCCAAGAC NH93T

donor_ids$organoid_id= donor_ids$donor_id

`HCM-CSHL-0366-C50_cells` = grep("GACAGTGC",donor_ids$organoid_id)
donor_ids$organoid_id[`HCM-CSHL-0366-C50_cells`]="HCM-CSHL-0366-C50"

NH85TSc_cells = grep("GAGTTAGC",donor_ids$organoid_id)
donor_ids$organoid_id[NH85TSc_cells]="NH85TSc"

NH95TT_cells = grep("GATGAATC",donor_ids$organoid_id)
donor_ids$organoid_id[NH95TT_cells]="NH95T"

NH93T_cells = grep("GCCAAGAC",donor_ids$organoid_id)
donor_ids$organoid_id[NH93T_cells]="NH93T"

summary(as.factor(donor_ids$organoid_id))

## add barcode labels to metadata
tail(donor_ids)
tail(rownames(sb06_obj@meta.data))

myBarcode = rownames(sb06_obj@meta.data) #get barcode from seurat
test = donor_ids[match(myBarcode, donor_ids$cell), ] #match the order of seurat barcode with you data
sb06_obj$vireo_id = test$organoid_id #put cell type into metadata

sb06_obj$vireo_id

make seurat objects for the individual TNBC tumor samples

## make Seurat objects
nh64t_obj= CreateSeuratObject(NH64T, min.cells = 3, min.features = 200, project = "NH64T")
nh87tt_obj= CreateSeuratObject(NH87T, min.cells = 3, min.features = 200, project = "NH87T")
hc_0655_obj= CreateSeuratObject(`HCM-CSHL-0655-C50`, min.cells = 3, min.features = 200, project = "HCM-CSHL-0655-C50")

# Check the metadata in the new Seurat objects
head(hc_0655_obj@meta.data)
head(sb06_obj@meta.data)

sb06_obj@meta.data$test= sb06_obj@meta.data$vireo_id
# set the orig.ident of sb06 as the patient id

sb06_obj@meta.data$orig.ident= sb06_obj@meta.data$vireo_id
head(sb06_obj@meta.data)

##
my.list= Filter(function(x) is(x, "Seurat"), mget(ls()))
names(my.list)

# add tumor vs normal in the metadata as a stimulus for the samples

my.list$nh64t_obj$stim ="tumor"
my.list$hc_0655_obj$stim ="tumor"
my.list$nh87tt_obj$stim ="tumor"
my.list$sb06_obj$stim ="tumor"

*merge seurat objects sources: https://github.com/hbctraining/scRNA-seq/blob/master/lessons/03_SC_quality_control-setup.md https://satijalab.org/seurat/v3.0/merge_vignette.html

merged_seurat <- merge(nh64t_obj, y = c(hc_0655_obj,nh87tt_obj,sb06_obj), 
                       add.cell.ids = c("nh64t_obj","hc_0655_obj","nh87tt_obj","sb06_obj"), project = "merged_all", 
                       merge.data = TRUE)

# Check that the merged object has the appropriate sample-specific prefixes
head(merged_seurat@meta.data)
tail(merged_seurat@meta.data)

# Explore merged metadata
# View(merged_seurat@meta.data)

# Add number of genes per UMI for each cell to metadata
merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)

# Compute percent mito ratio
merged_seurat$mitoRatio <- PercentageFeatureSet(object = merged_seurat, pattern = "^MT-")
merged_seurat$mitoRatio <- merged_seurat@meta.data$mitoRatio / 100

View(merged_seurat@meta.data)

# Create metadata dataframe
metadata <- merged_seurat@meta.data

# Add cell IDs to metadata
metadata$cells <- rownames(metadata)

# Rename columns
metadata <- metadata %>%
  dplyr::rename(seq_folder = orig.ident,
                nUMI = nCount_RNA,
                nGene = nFeature_RNA)

# Create sample column
metadata$sample <- NA
metadata$sample[which(str_detect(metadata$seq_folder, "HCM-CSHL-0655-C50"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH87T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH64T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH93T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH95T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH85TSc"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "HCM-CSHL-0366-C50"))] <- "tumor"


# Add metadata back to Seurat object
merged_seurat@meta.data <- metadata

# Create .RData object to load at any time
save(merged_seurat, file="merged_seurat_tumor.RData")

Visualize the number of cell counts per sample

#pdf("tumor_total_cell_numbers.#pdf")
metadata %>% 
  ggplot(aes(x=sample, fill=sample)) + 
  geom_bar() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  ggtitle("NCells")

#dev.off()

#pdf("tumor_total_cell_numbers_sample_type.#pdf")
metadata %>% 
  ggplot(aes(x=seq_folder, fill=seq_folder)) + 
  geom_bar() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  ggtitle("NCells")

#dev.off()


# Visualize the number UMIs/transcripts per cell
#pdf("tumor_total_cell_numbers_UMI_per_cell.#pdf")
metadata %>% 
  ggplot(aes(color=sample, x=nUMI, fill= sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

#dev.off()


#pdf("tumor_total_cell_numbers_UMI_per_cell_type.#pdf")
metadata %>% 
  ggplot(aes(color=seq_folder, x=nUMI, fill= seq_folder)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500)

#dev.off()



# Visualize the distribution of genes detected per cell via histogram
#pdf("tumor_total_cell_numbers_genes_per_cell.#pdf")
metadata %>% 
  ggplot(aes(color=sample, x=nGene, fill= sample)) + 
  geom_density(alpha = 0.2) + 
  theme_classic() +
  scale_x_log10() + 
  geom_vline(xintercept = 300)

#dev.off()



# Visualize the distribution of genes detected per cell via boxplot
#pdf("tumor_total_cell_numbers_genes_per_cell_type.#pdf")
metadata %>% 
  ggplot(aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  ggtitle("NCells vs NGenes")

#dev.off()



# Visualize the correlation between genes detected and number of UMIs 
# Are there cells present with low numbers of genes/UMIs?
#pdf("tumor_correlation_genes_vs_num_of_UMIs.#pdf")
metadata %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~sample)
## `geom_smooth()` using formula 'y ~ x'

#dev.off()


#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype.#pdf")
metadata %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'

#dev.off()


# Visualize the distribution of mitochondrial gene expression detected per cell
#pdf("tumor_correlation_MTgenes_vs_cell.#pdf")
metadata %>% 
  ggplot(aes(color=sample, x=mitoRatio, fill=sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 194 rows containing non-finite values (stat_density).

#dev.off()

#pdf("tumor_correlation_MTgenes_vs_cell_celltype.#pdf")
metadata %>% 
  ggplot(aes(color=seq_folder, x=mitoRatio, fill=seq_folder)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 194 rows containing non-finite values (stat_density).

#dev.off()



# Visualize the overall complexity of the gene expression by visualizing the genes detected per UMI
#pdf("tumor_complexity_genes_per_UMI.#pdf")
metadata %>%
  ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8)

#dev.off()

#pdf("tumor_complexity_genes_per_UMI_celltype.#pdf")
metadata %>%
  ggplot(aes(x=log10GenesPerUMI, color = seq_folder, fill=seq_folder)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8)

#dev.off()

Filter out low quality reads using selected thresholds

use the merged and filtered object for all downstream analysis

filtered_seurat <- subset(x = merged_seurat, 
                          subset= (nUMI >= 500) & 
                            (nGene >= 250) & 
                            (log10GenesPerUMI > 0.80) & 
                            (mitoRatio < 0.20))

# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = filtered_seurat, slot = "counts")

# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero <- counts > 0

# Sums all TRUE values and returns TRUE if more than 10 TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= 10

# Only keeping those genes expressed in more than 10 cells
filtered_counts <- counts[keep_genes, ]

# Reassign to filtered Seurat object
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = filtered_seurat@meta.data)

# Save filtered subset to new metadata
metadata_clean <- filtered_seurat@meta.data

### Plot after filtering

#pdf("tumor_complexity_genes_per_UMI_FILTERED.#pdf")
metadata_clean %>%
  ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8)

#dev.off()


#pdf("tumor_correlation_MTgenes_vs_cell_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(color=sample, x=mitoRatio, fill=sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 194 rows containing non-finite values (stat_density).

#dev.off()

#pdf("tumor_correlation_genes_vs_num_of_UMIs_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~sample)
## `geom_smooth()` using formula 'y ~ x'

#dev.off()

#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'

#dev.off()


#pdf("tumor_total_cell_numbers_genes_per_cell_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(color=sample, x=nGene, fill= sample)) + 
  geom_density(alpha = 0.2) + 
  theme_classic() +
  scale_x_log10() + 
  geom_vline(xintercept = 300)

#dev.off()

#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'

#dev.off()


#pdf("tumor_total_cell_numbers_genes_per_cell_type_FILTERED.#pdf")
metadata_clean %>% 
  ggplot(aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  ggtitle("NCells vs NGenes")

#dev.off()


# Create .RData object to load at any time
save(filtered_seurat, file="seurat_filtered_tumor.RData")

saveRDS(filtered_seurat, file="seurat_filtered_tumor.rds")

Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] readr_1.4.0        ggplot2_3.3.3      stringr_1.4.0      hdf5r_1.3.3       
##  [5] cowplot_1.1.1      future.apply_1.7.0 future_1.21.0      magrittr_2.0.1    
##  [9] Matrix_1.3-2       dplyr_1.0.5        Seurat_3.2.3       knitr_1.31        
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15           colorspace_2.0-0     deldir_0.2-10       
##   [4] ellipsis_0.3.1       ggridges_0.5.3       rstudioapi_0.13     
##   [7] spatstat.data_2.0-0  farver_2.1.0         leiden_0.3.7        
##  [10] listenv_0.8.0        ggrepel_0.9.1        bit64_4.0.5         
##  [13] fansi_0.4.2          codetools_0.2-18     splines_3.6.3       
##  [16] polyclip_1.10-0      jsonlite_1.7.2       ica_1.0-2           
##  [19] cluster_2.1.1        png_0.1-7            uwot_0.1.10         
##  [22] shiny_1.6.0          sctransform_0.3.2    compiler_3.6.3      
##  [25] httr_1.4.2           assertthat_0.2.1     fastmap_1.1.0       
##  [28] lazyeval_0.2.2       cli_2.3.1            later_1.1.0.1       
##  [31] htmltools_0.5.1.1    tools_3.6.3          rsvd_1.0.3          
##  [34] igraph_1.2.6         gtable_0.3.0         glue_1.4.2          
##  [37] RANN_2.6.1           reshape2_1.4.4       Rcpp_1.0.6          
##  [40] spatstat_1.64-1      scattermore_0.7      jquerylib_0.1.3     
##  [43] vctrs_0.3.6          nlme_3.1-152         lmtest_0.9-38       
##  [46] xfun_0.21            globals_0.14.0       mime_0.10           
##  [49] miniUI_0.1.1.1       lifecycle_1.0.0      irlba_2.3.3         
##  [52] goftest_1.2-2        MASS_7.3-53.1        zoo_1.8-9           
##  [55] scales_1.1.1         hms_1.0.0            promises_1.2.0.1    
##  [58] spatstat.utils_2.0-0 parallel_3.6.3       RColorBrewer_1.1-2  
##  [61] yaml_2.2.1           reticulate_1.18      pbapply_1.4-3       
##  [64] gridExtra_2.3        sass_0.3.1           rpart_4.1-15        
##  [67] stringi_1.5.3        highr_0.8            rlang_0.4.10        
##  [70] pkgconfig_2.0.3      matrixStats_0.58.0   evaluate_0.14       
##  [73] lattice_0.20-41      ROCR_1.0-11          purrr_0.3.4         
##  [76] tensor_1.5           labeling_0.4.2       patchwork_1.1.1     
##  [79] htmlwidgets_1.5.3    bit_4.0.4            tidyselect_1.1.0    
##  [82] parallelly_1.23.0    RcppAnnoy_0.0.18     plyr_1.8.6          
##  [85] R6_2.5.0             generics_0.1.0       DBI_1.1.1           
##  [88] pillar_1.5.1         withr_2.4.1          mgcv_1.8-34         
##  [91] fitdistrplus_1.1-3   survival_3.2-7       abind_1.4-5         
##  [94] tibble_3.1.0         crayon_1.4.1         KernSmooth_2.23-18  
##  [97] utf8_1.1.4           plotly_4.9.3         rmarkdown_2.7       
## [100] grid_3.6.3           data.table_1.14.0    digest_0.6.27       
## [103] xtable_1.8-4         tidyr_1.1.3          httpuv_1.5.5        
## [106] munsell_0.5.0        viridisLite_0.3.0    bslib_0.2.4